## Setup Test/Train: sample 75% of the row indices for training, rest for test.
## seq_len() is safe even for zero-row data, unlike 1:nrow().
train_idx <- sample(seq_len(nrow(movies)), size = floor(0.75 * nrow(movies)))
movies_train <- movies %>% slice(train_idx)
movies_test <- movies %>% slice(-train_idx)
# Where 0.75 is the percentage (75%) of the data to put in the Training set.
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## num [1:7] 1 4 7 10 13 16 19
## [1] 1 2 3 4 100 58 5568
## num [1:7] 1 2 3 4 100 ...
## [1] "hey!"
## [1] "jon" "Peter" "Sam"
## chr [1:3] "jon" "Peter" "Sam"
## chr [1:4] "Dec" "May" "Apr" "Dec"
## Factor w/ 3 levels "Apr","Dec","May": 2 3 1 2
## birth_month
## Apr Dec May
## 1 2 1
# Define calendar order so the factor sorts Jan..Dec instead of alphabetically.
month_levels <- c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep",
                  "Oct", "Nov", "Dec")
birth_month <- factor(birth_month, levels = month_levels)
str(birth_month)
## Factor w/ 12 levels "Jan","Feb","Mar",..: 12 5 4 12
## [1] Dec May Apr
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## birth_month
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 0 0 0 1 1 0 0 0 0 0 0 2
You can view help via vignette("dplyr") and vignette("two-table"), or check out the online docs. dplyr is a part of the tidyverse.
Can Use | as or
##
## Aboriginal Arabic Aramaic Bosnian Cantonese
## 12 2 5 1 1 11
## Chinese Czech Danish Dari Dutch Dzongkha
## 3 1 5 2 4 1
## English Filipino French German Greek Hebrew
## 4704 1 73 19 1 5
## Hindi Hungarian Icelandic Indonesian Italian Japanese
## 28 1 2 2 11 18
## Kannada Kazakh Korean Mandarin Maya Mongolian
## 1 1 8 26 1 1
## None Norwegian Panjabi Persian Polish Portuguese
## 2 4 1 4 4 8
## Romanian Russian Slovenian Spanish Swahili Swedish
## 2 11 1 40 1 5
## Tamil Telugu Thai Urdu Vietnamese Zulu
## 1 1 3 1 1 2
# Keep only English- or Spanish-language films. %in% replaces the chained
# `== ... | == ...` form and never returns NA.
moviesSub <- movies %>% filter(language %in% c("English", "Spanish"))
# Spanish-language OR very-large-budget films.
moviesBig_Spanish <- movies %>% filter(language == "Spanish" | budget > 10000000000)
# English/Spanish films with budget over $100M; chained filters AND together.
movies_eng_sp <- movies %>%
  filter(language %in% c("English", "Spanish")) %>%
  filter(budget > 100000000)
dim(movies_eng_sp)
## [1] 308  28
Order the rows
Only see certain rows
Used to pick out certain variables
## [1] 5043 4
## [1] "movie_title" "director_name" "gross" "budget"
## [1] "movie_title" "director_name"
## [3] "gross" "budget"
## [5] "color" "num_critic_for_reviews"
## [7] "duration" "director_facebook_likes"
## [9] "actor_3_facebook_likes" "actor_2_name"
## [11] "actor_1_facebook_likes" "genres"
## [13] "actor_1_name" "num_voted_users"
## [15] "cast_total_facebook_likes" "actor_3_name"
## [17] "facenumber_in_poster" "plot_keywords"
## [19] "movie_imdb_link" "num_user_for_reviews"
## [21] "language" "country"
## [23] "content_rating" "title_year"
## [25] "actor_2_facebook_likes" "imdb_score"
## [27] "aspect_ratio" "movie_facebook_likes"
starts_with(): Starts with a prefix.ends_with(): Ends with a suffix.contains(): Contains a literal string.matches(): Matches a regular expression.num_range(): Matches a numerical range like x01, x02, x03.one_of(): Matches variable names in a character vector.everything(): Matches all variables.last_col(): Select last variable, possibly with an offset.## [1] "movie_title" "director"
## [3] "gross" "budget"
## [5] "color" "num_critic_for_reviews"
## [7] "duration" "director_facebook_likes"
## [9] "actor_3_facebook_likes" "actor_2_name"
## [11] "actor_1_facebook_likes" "genres"
## [13] "actor_1_name" "num_voted_users"
## [15] "cast_total_facebook_likes" "actor_3_name"
## [17] "facenumber_in_poster" "plot_keywords"
## [19] "movie_imdb_link" "num_user_for_reviews"
## [21] "language" "country"
## [23] "content_rating" "title_year"
## [25] "actor_2_facebook_likes" "imdb_score"
## [27] "aspect_ratio" "movie_facebook_likes"
# Total gross (in $M) per director.
Director_Tot <- movies %>%
  group_by(director) %>%
  summarize(grossTotDir = sum(grossM, na.rm = TRUE))
head(Director_Tot)

# Per-director count and summary stats of gross (in $M).
Director_Stats <- movies %>%
  group_by(director) %>%
  summarize(n = n(), min = min(grossM), max = max(grossM),
            avg = mean(grossM), sd = sd(grossM))
Director_Stats %>% arrange(desc(max)) %>% slice(1:10)

# Derive main genre (first token of the pipe-separated `genres` string) and
# money columns in millions; ROI = profit / budget. Refer to `genres`
# directly inside mutate() rather than `movies$genres`, so the column comes
# from the piped data frame.
movies <- movies %>%
  mutate(genre_main = unlist(map(strsplit(as.character(genres), "\\|"), 1)),
         grossM = gross / 1000000,
         budgetM = budget / 1000000,
         profitM = grossM - budgetM,
         ROI = profitM / budgetM)
movies <- movies %>% mutate(genre_main = factor(genre_main) %>% fct_drop())

# Average gross and profit per director. NOTE: the column created above is
# `profitM` (the notes originally averaged a non-existent `profit` column).
Director_Avg <- movies %>%
  group_by(director) %>%
  summarize(num_movies = n(),
            grossAvgDir = mean(grossM, na.rm = TRUE),
            profitAvgDir = mean(profitM, na.rm = TRUE))
Director_Avg %>% arrange(desc(profitAvgDir)) %>% filter(num_movies > 1) %>%
  slice(1:20)

Director_Tot <- movies %>%
  group_by(director) %>%
  summarize(grossTotDir = sum(grossM, na.rm = TRUE))

# ROI summary by main genre, with standard error of the mean (sd / sqrt(n)).
genre_collapse <- movies %>%
  group_by(genre_main) %>%
  summarize(Avg_ROI_genre = mean(ROI, na.rm = TRUE),
            SD_ROI_genre = sd(ROI, na.rm = TRUE),
            SE_ROI_genre = sd(ROI, na.rm = TRUE) / sqrt(n()),
            num_films = n())

# Same ROI summary, by lead actor.
actor_sum <- movies %>%
  group_by(actor_1_name) %>%
  summarize(Avg_ROI_actor = mean(ROI, na.rm = TRUE),
            SD_ROI_actor = sd(ROI, na.rm = TRUE),
            SE_ROI_actor = sd(ROI, na.rm = TRUE) / sqrt(n()),
            num_films = n())
# Top 5 actors by average ROI among those with more than 2 films.
actor_sum %>% filter(num_films > 2) %>% top_n(5, wt = Avg_ROI_actor)
## [1] 0.2909999 0.2755208 2.0667876 12.3284782 0.4697230 0.2315017
## [7] 1.6732844 0.2963527 7.1413626 1.1121905 NA NA
## [13] 40.1424230 11.4088478 0.4763281 1.0834822 0.6181190 1.4418917
## [19] 0.4705548
You can view help via vignette("magrittr") or check out the online docs. magrittr is a part of the tidyverse.
We start with a value, here mtcars (a data.frame). Based on this, we first extract a subset, then we aggregate the information based on the number of cylinders, and then we transform the dataset by adding a variable for kilometers per liter as supplement to miles per gallon. Finally we print the result before assigning it. Note how the code is arranged in the logical order of how you think about the task: data->transform->aggregate, which is also the same order as the code will execute. It’s like a recipe – easy to read, easy to follow!
# The package is magrittr (the notes had the misspelling "magritter").
library(magrittr)
# Subset, aggregate by cylinder count with an on-the-fly rounding-mean
# function, then add km-per-liter alongside miles-per-gallon, and print.
car_data <- mtcars %>%
  subset(hp > 100) %>%
  aggregate(. ~ cyl, data = ., FUN = . %>% mean %>% round(2)) %>%
  transform(kpl = mpg %>% multiply_by(0.4251)) %>%
  print
## cyl mpg disp hp drat wt qsec vs am gear carb kpl
## 1 4 25.90 108.05 111.00 3.94 2.15 17.75 1.00 1.00 4.50 2.00 11.010090
## 2 6 19.74 183.31 122.29 3.59 3.12 17.98 0.57 0.43 3.86 3.43 8.391474
## 3 8 15.10 353.10 209.21 3.23 4.00 16.77 0.00 0.14 3.29 3.50 6.419010
Note also how “building” a function on the fly for use in aggregate is very simple in magrittr: rather than an actual value as left-hand side in pipeline, just use the placeholder. This is also very useful in R’s *apply family of functions.
The combined example shows a few neat features of the pipe (which it is not):
%>% may be used in a nested fashion, e.g. it may appear in expressions within arguments. This is used in the mpg to kpl conversion.One feature, which was not utilized above is piping into anonymous functions, or lambdas. This is possible using standard function definitions, e.g.
However, magrittr also allows a short-hand notation:
The “tee” operator, %T>% works like %>%, except it returns the left-hand side value, and not the result of the right-hand side operation. This is useful when a step in a pipeline is used for its side-effect (printing, plotting, logging, etc.). As an example:
## [1] -4.63495 17.95214
The “exposition” pipe operator, %$% exposes the names within the left-hand side object to the right-hand side expression. Essentially, it is a short-hand for using the with functions (and the same left-hand side objects are accepted). This operator is handy when functions do not themselves have a data argument, as for example lm and aggregate do. Here are a few examples as illustration:
## [1] 0.3365679
Finally, the compound assignment pipe operator %<>% can be used as the first pipe in a chain. The effect will be that the result of the pipeline is assigned to the left-hand side object, rather than returning the result as usual. It is essentially shorthand notation for expressions like foo <- foo %>% bar %>% baz, which boils down to foo %<>% bar %>% baz. Another example is
The %<>% can be used whenever expr <- … makes sense, e.g.
x %<>% foo %>% barx[1:10] %<>% foo %>% barx$baz %<>% foo %>% barggplot2 is a system for declaratively creating graphics, based on The Grammar of Graphics. You provide the data, tell ggplot2 how to map variables to aesthetics, what graphical primitives to use, and it takes care of the details.
It’s hard to succinctly describe how ggplot2 works because it embodies a deep philosophy of visualisation. However, in most cases you start with ggplot(), supply a dataset and aesthetic mapping (with aes()). You then add on layers (like geom_point() or geom_histogram()), scales (like scale_colour_brewer()), faceting specifications (like facet_wrap()) and coordinate systems (like coord_flip()).
geom_bar makes the height of the bar proportional to the number of cases in each group (or if the weight aesthetic is supplied, the sum of the weights). If you want the heights of the bars to represent values in the data, use geom_col() instead. geom_bar() uses stat_count() by default: it counts the number of cases at each x position. geom_col() uses stat_identity(): it leaves the data as is.
# Bar chart: stat = "identity" uses the y values directly (same as geom_col).
ggplot(gmsc_train, aes(y = SeriousDlqin2yrs, x = age)) +
  geom_bar(stat = "identity", fill = "red") +
  ggtitle("Serious Delinquencies in 2 years vs By Age") +
  labs(x = "Age", y = "Serious Delinquencies in 2 years")

# geom_col() is the geom_bar(stat = "identity") shorthand.
ggplot(gmsc_train, aes(y = SeriousDlqin2yrs, x = NumberRealEstateLoansOrLines)) +
  geom_col(fill = "red") +
  ggtitle("Serious Delinquencies in 2 years vs Number of Loans or Lines") +
  labs(x = "Number of Real Estate Loans Or Lines",
       y = "Serious Delinquencies in 2 years")

# The jitter geom is a convenient shortcut for geom_point(position = "jitter").
# It adds a small amount of random variation to each point's location — a
# useful way of handling overplotting caused by discreteness in smaller
# datasets.
ggplot(gmsc_train, aes(x = MonthlyIncome, y = SeriousDlqin2yrs)) +
  geom_jitter(alpha = 1/10, color = "hotpink") +
  xlim(0, 100000) +
  ggtitle("Serious Delinquencies in 2 years vs Monthly Income") +
  labs(x = "Monthly Income", y = "Serious Delinquencies in 2 years")
## Warning: Removed 18384 rows containing missing values (geom_point).
Visualise the distribution of a single continuous variable by dividing the x axis into bins and counting the number of observations in each bin. Histograms (geom_histogram()) display the counts with bars; frequency polygons (geom_freqpoly()) display the counts with lines. Frequency polygons are more suitable when you want to compare the distribution across the levels of a categorical variable.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Boxplots of hwy by class, classes ordered by median hwy; legend suppressed
# because fill duplicates the x axis.
mpg %>%
  ggplot(aes(x = reorder(class, hwy), y = hwy, fill = class)) +
  geom_boxplot() +
  xlab("class") +
  theme(legend.position = "none")
# In a line graph, observations are ordered by x value and connected.
The functions geom_line(), geom_step(), or geom_path() can be used.
x value (for x axis) can be :
Data derived from ToothGrowth data sets are used. ToothGrowth describes the effect of Vitamin C on tooth growth in Guinea pigs.
See more here
# Basic line plot with points
ggplot(data = df, aes(x = dose, y = len, group = 1)) + geom_line() + geom_point()
# Change the line type
ggplot(data = df, aes(x = dose, y = len, group = 1)) +
  geom_line(linetype = "dashed") + geom_point()
# Change the color
ggplot(data = df, aes(x = dose, y = len, group = 1)) +
  geom_line(color = "red") + geom_point()
# Observations can also be connected using geom_step() or geom_path():
ggplot(data = df, aes(x = dose, y = len, group = 1)) + geom_step() + geom_point()
ggplot(data = df, aes(x = dose, y = len, group = 1)) + geom_path() + geom_point()

# Bar chart of average ROI by genre, flipped horizontal, with +/- 1 SE bars.
ggplot(Genre_ROI, aes(y = ROI_avg, x = genre_main)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("Best Performing Genre By ROI") +
  geom_errorbar(mapping = aes(ymin = ROI_avg - se_ROI, ymax = ROI_avg + se_ROI))
## Warning: Removed 2 rows containing missing values (position_stack).
## Warning: Removed 2 rows containing missing values (geom_errorbar).
library("ggthemes")  # see the ggthemes package docs
Base example ggplot() + geom_point(data = iris, aes(x = Petal.Width, y = Petal.Length,color = Species))
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# For the complete list of themes in the package check:
ls("package:ggthemes")[grepl("theme_", ls("package:ggthemes"))]
## [1] "theme_base" "theme_calc"
## [3] "theme_clean" "theme_economist"
## [5] "theme_economist_white" "theme_excel"
## [7] "theme_excel_new" "theme_few"
## [9] "theme_fivethirtyeight" "theme_foundation"
## [11] "theme_gdocs" "theme_hc"
## [13] "theme_igray" "theme_map"
## [15] "theme_pander" "theme_par"
## [17] "theme_solarized" "theme_solarized_2"
## [19] "theme_solid" "theme_stata"
## [21] "theme_tufte" "theme_wsj"
# Theme similar to the default settings of LibreOffice Calc
ggplot() +
  geom_point(data = iris, aes(x = Petal.Width, y = Petal.Length, color = Species)) +
  theme_calc()

# Style plots similar to those in The Economist
ggplot() +
  geom_point(data = iris, aes(x = Petal.Width, y = Petal.Length, color = Species)) +
  theme_economist()

# The Economist style with a white background
ggplot() +
  geom_point(data = iris, aes(x = Petal.Width, y = Petal.Length, color = Species)) +
  theme_economist_white()

# Theme based on the rules and examples in Stephen Few, 'Practical rules for
# using colors in charts'
ggplot() +
  geom_point(data = iris, aes(x = Petal.Width, y = Petal.Length, color = Species)) +
  theme_few()

library(ISLR)
library(tidyverse)
data("Default")
library(caret)
# Logistic model of default on balance; lift chart via caret (note the factor
# levels put the event class "Yes" first, as lift() expects).
logit_fit3 <- glm(default ~ balance, family = binomial, data = Default)
preds_DF <- data.frame(preds = predict(logit_fit3, type = "response"),
                       true = factor(Default$default, levels = c("Yes", "No")))
creditLift <- lift(true ~ preds, data = preds_DF)
xyplot(creditLift, main = "X = balance")

library("caret")
# Same lift-chart workflow with `student` as the predictor.
logit_fit2 <- glm(default ~ student, family = binomial, data = Default)
preds_DF <- data.frame(preds = predict(logit_fit2, type = "response"),
                       true = factor(Default$default, levels = c("Yes", "No")))
creditLift <- lift(true ~ preds, data = preds_DF)
xyplot(creditLift, main = "X = student")

# 0/1 outcome plus predicted probabilities, for the calibration plot below.
scores3DF <- data.frame(default = ifelse(Default$default == "Yes", 1, 0),
                        scores = predict(logit_fit3, type = "response"))
library(plyr)
# Bin the predicted probabilities and take column-wise means within each bin
# (a calibration table: mean prediction vs observed event rate per bin).
calData <- ddply(scores3DF,
                 .(cut(scores3DF$scores,
                       c(0, 0.05, 0.15, 0.25, 0.35, 0.45, 0.55, 0.65, 0.75,
                         0.85, 0.95, 1))),
                 colwise(mean))
# Rename the 3 ddply columns BEFORE appending midpoint — the original notes
# added midpoint first, which would make the 3-name colnames() assignment
# mismatch a 4-column frame.
colnames(calData) <- c("preds", "true", "midpoint")
calData$midpoint <- c(0.025, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.975)
# Calibration plot: observed event rate vs bin midpoint; the red y = x line
# marks perfect calibration.
calPlot <- ggplot(calData, aes(x = midpoint, y = true)) +
  geom_point() +
  ylim(0, 1) +
  geom_abline(intercept = 0, slope = 1, color = "red") +
  xlab("Prediction midpoint") +
  ylab("Observed event percentage")
plot(calPlot)

library("ROSE")
# Down-sample with ROSE to N = 666 roughly balanced (p = 1/2) observations.
data_rose_down <- ROSE(default ~ ., data = Default, N = 666, p = 1/2)
table(data_rose_down$data$default)
##
##  No Yes
## 344 322
# Up-sample to N = 12000 roughly balanced observations.
data_rose_up <- ROSE(default ~ ., data = Default, N = 12000, p = 1/2)
table(data_rose_up$data$default)
##
##   No  Yes
## 5951 6049
# logit downsampled model
logit_down <- glm(default ~ balance, data = data_rose_down$data, family = "binomial")
summary(logit_down)
##
## Call:
## glm(formula = default ~ balance, family = "binomial", data = data_rose_down$data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.72103 -0.37663 -0.03521 0.39108 2.53476
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.0444298 0.5681151 -12.40 <0.0000000000000002 ***
## balance 0.0052319 0.0004063 12.88 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 922.55 on 665 degrees of freedom
## Residual deviance: 400.59 on 664 degrees of freedom
## AIC: 404.59
##
## Number of Fisher Scoring iterations: 6
# logit up-sampled
logit_up <- glm(default ~ balance, data = data_rose_up$data, family = "binomial")
summary(logit_up)
##
## Call:
## glm(formula = default ~ balance, family = "binomial", data = data_rose_up$data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.3553 -0.3661 0.0559 0.4392 2.9928
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.7472081 0.1271210 -53.08 <0.0000000000000002 ***
## balance 0.0051232 0.0000918 55.81 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 16634.7 on 11999 degrees of freedom
## Residual deviance: 7465.1 on 11998 degrees of freedom
## AIC: 7469.1
##
## Number of Fisher Scoring iterations: 6
# vanilla logit (original, imbalanced data)
logit <- glm(default ~ balance, data = Default, family = "binomial")
# generate scores and class predictions (<- for assignment, per R convention)
scores_down <- predict(logit_down, type = "response")
scores_up <- predict(logit_up, type = "response")
scores_reg <- predict(logit, type = "response")
# classify at the 0.5 probability threshold
class_down <- ifelse(scores_down > 0.5, 1, 0)
class_up <- ifelse(scores_up > 0.5, 1, 0)
class_reg <- ifelse(scores_reg > 0.5, 1, 0)
# > diagnostics(table(data_rose_down$data$default,class_down)) Accuracy:
# 0.872 TP: 307 TN: 274 Sensitivity: 0.9 Specificity: 0.843 False Pos Rate:
# 0.157 > diagnostics(table(data_rose_up$data$default,class_up)) Accuracy:
# 0.869 TP: 5282 TN: 5149 Sensitivity: 0.88 Specificity: 0.859 False Pos
# Rate: 0.141 > diagnostics(table(Default$default,class_reg))# Auto dataset
# Auto dataset
library(ISLR)
library(tidyverse)
set.seed(1861)
data(Auto)
Auto_sub <- Auto %>% select(-name)
head(Auto_sub)

# Leave-one-out CV: refit dropping one row at a time, predict the held-out
# row. Preallocate, and iterate over Auto_sub (the data actually modeled) —
# the original looped 1:nrow(Auto) while slicing Auto_sub.
mods_LOOCV <- vector("list", nrow(Auto_sub))
preds_LOOCV <- numeric(nrow(Auto_sub))
for (i in seq_len(nrow(Auto_sub))) {
  mod <- lm(mpg ~ ., data = Auto_sub %>% slice(-i))
  preds_LOOCV[i] <- predict(mod, newdata = slice(Auto_sub, i))
  mods_LOOCV[[i]] <- mod
}
head(preds_LOOCV)
## [1] 14.92963 13.98084 15.17965 15.04054 14.91319 10.54207
# In-sample fit on the full data, for comparison against LOOCV predictions.
mod_insample <- lm(mpg ~ ., data = Auto_sub)
# compute RMSE LOOCV
preds_DF <- data.frame(preds_LOOCV = preds_LOOCV,
                       preds_insample = predict(mod_insample),
                       true = Auto$mpg)
library(caret)
RMSE(preds_DF$preds_LOOCV, preds_DF$true)
## [1] 3.37211
## [1] 3.293551
## [1] 0.8128915
## [1] 0.8214781
### K-Fold Cross Validation
# Assign each row to one of 10 folds (stratified on mpg by createFolds).
Auto_sub <- mutate(Auto_sub, folds = createFolds(Auto_sub$mpg, k = 10, list = FALSE))
nfolds <- 10
preds_10FoldCV_DF <- data.frame(folds = Auto_sub$folds,
                                preds_10FoldCV = rep(NA, nrow(Auto_sub)))
# Fit on 9 folds, predict the held-out fold, store by fold membership.
for (i in seq_len(nfolds)) {
  mod <- lm(mpg ~ ., data = Auto_sub %>% filter(folds != i))
  preds <- predict(mod, newdata = filter(Auto_sub, folds == i))
  preds_10FoldCV_DF[preds_10FoldCV_DF$folds == i, "preds_10FoldCV"] <- preds
}
preds_DF <- data.frame(preds_10FoldCV = preds_10FoldCV_DF$preds_10FoldCV, preds_DF)
RMSE(preds_DF$preds_10FoldCV, preds_DF$true)
## [1] 3.349023
## [1] 3.37211
## [1] 3.293551
## [1] 0.8154466
## [1] 0.8128915
## [1] 0.8214781
B <- 100       # number of bootstrapped datasets
n_boot <- 200  # size of each bootstrapped sample
# Preallocate rather than growing the vector one element per iteration.
coef_boot <- numeric(B)
for (b in seq_len(B)) {
  idx <- sample(seq_len(nrow(Auto_sub)), size = n_boot, replace = TRUE)
  mod <- lm(mpg ~ displacement, data = Auto_sub %>% slice(idx))
  coef_boot[b] <- mod$coefficients[2]  # slope on displacement
}
mod_lm <- lm(mpg ~ displacement, data = Auto_sub)
coef_boot <- data.frame(coef_boot = coef_boot)
# Bootstrap sampling distribution of the slope; red line = full-sample fit.
ggplot(coef_boot, aes(x = coef_boot)) +
  geom_histogram() +
  geom_vline(xintercept = mod_lm$coefficients[2], color = "red")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
library(rsample)
library(broom)
library(purrr)
library(boot)
boots <- bootstraps(Auto_sub, times = 100)
boots
# Fit the no-intercept model on one bootstrap resample. (The extracted notes
# fused the printed `boots` onto this name; the map() call below requires
# the function to be named fit_lm_on_boots.)
fit_lm_on_boots <- function(split) {
  lm(mpg ~ displacement - 1, data = analysis(split))
}
boot_mods <- boots %>%
  mutate(model = map(splits, fit_lm_on_boots),
         coef_info = map(model, tidy))
boot_coefs <- boot_mods %>% unnest(coef_info)
boot_coefs

library("leaps")
# Forward stepwise selection over subsets of up to 7 predictors.
auto_fit_fwd <- regsubsets(mpg ~ ., data = Auto_sub, nvmax = 7, method = "forward")
summary(auto_fit_fwd)
## Subset selection object
## Call: regsubsets.formula(mpg ~ ., data = Auto_sub, nvmax = 7, method = "forward")
## 8 Variables (and intercept)
## Forced in Forced out
## cylinders FALSE FALSE
## displacement FALSE FALSE
## horsepower FALSE FALSE
## weight FALSE FALSE
## acceleration FALSE FALSE
## year FALSE FALSE
## origin FALSE FALSE
## folds FALSE FALSE
## 1 subsets of each size up to 7
## Selection Algorithm: forward
## cylinders displacement horsepower weight acceleration year origin
## 1 ( 1 ) " " " " " " "*" " " " " " "
## 2 ( 1 ) " " " " " " "*" " " "*" " "
## 3 ( 1 ) " " " " " " "*" " " "*" "*"
## 4 ( 1 ) " " "*" " " "*" " " "*" "*"
## 5 ( 1 ) " " "*" "*" "*" " " "*" "*"
## 6 ( 1 ) "*" "*" "*" "*" " " "*" "*"
## 7 ( 1 ) "*" "*" "*" "*" " " "*" "*"
## folds
## 1 ( 1 ) " "
## 2 ( 1 ) " "
## 3 ( 1 ) " "
## 4 ( 1 ) " "
## 5 ( 1 ) " "
## 6 ( 1 ) " "
## 7 ( 1 ) "*"
# Backward stepwise selection over subsets of up to 7 predictors.
auto_fit_bkwd <- regsubsets(mpg ~ ., data = Auto_sub, nvmax = 7, method = "backward")
summary(auto_fit_bkwd)
## Subset selection object
## Call: regsubsets.formula(mpg ~ ., data = Auto_sub, nvmax = 7, method = "backward")
## 8 Variables (and intercept)
## Forced in Forced out
## cylinders FALSE FALSE
## displacement FALSE FALSE
## horsepower FALSE FALSE
## weight FALSE FALSE
## acceleration FALSE FALSE
## year FALSE FALSE
## origin FALSE FALSE
## folds FALSE FALSE
## 1 subsets of each size up to 7
## Selection Algorithm: backward
## cylinders displacement horsepower weight acceleration year origin
## 1 ( 1 ) " " " " " " "*" " " " " " "
## 2 ( 1 ) " " " " " " "*" " " "*" " "
## 3 ( 1 ) " " " " " " "*" " " "*" "*"
## 4 ( 1 ) " " "*" " " "*" " " "*" "*"
## 5 ( 1 ) " " "*" "*" "*" " " "*" "*"
## 6 ( 1 ) "*" "*" "*" "*" " " "*" "*"
## 7 ( 1 ) "*" "*" "*" "*" " " "*" "*"
## folds
## 1 ( 1 ) " "
## 2 ( 1 ) " "
## 3 ( 1 ) " "
## 4 ( 1 ) " "
## 5 ( 1 ) " "
## 6 ( 1 ) " "
## 7 ( 1 ) "*"
You can use str to get info about what is contained in a model ie: str(mod1) ## Setup Test/Train
# Sample 75% of row indices for training; seq_len() is the safe index form.
train_idx <- sample(seq_len(nrow(movies)), size = floor(0.75 * nrow(movies)))
movies_train <- movies %>% slice(train_idx)
movies_test <- movies %>% slice(-train_idx)
# Where 0.75 is the percentage (75%) of the data to put in the Training set.
Linear regression is a linear approach to modeling the relationship between a scalar response (or dependent variable) and one or more explanatory variables (or independent variables). The case of one explanatory variable is called simple linear regression. Generate a linear model with lm(); the desired formula is written with the dependent variable followed by ~ and then a list of the independent variables. You can use . for all variables, or something like y ~ . - director to exclude one. You can get the coefficients like this: mod1$coefficients[1]
##
## Call:
## lm(formula = gross ~ budget + duration, data = movies_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -398660902 -23412577 -9492452 12647133 495725939
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -12355521.86555 4930418.20639 -2.506 0.0123
## budget 0.99734 0.02389 41.752 < 0.0000000000000002
## duration 229023.81048 45694.54514 5.012 0.000000571
##
## (Intercept) *
## budget ***
## duration ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 53860000 on 2905 degrees of freedom
## (496 observations deleted due to missingness)
## Multiple R-squared: 0.4172, Adjusted R-squared: 0.4168
## F-statistic: 1040 on 2 and 2905 DF, p-value: < 0.00000000000000022
Logistic regression is a statistical model that in its basic form uses a logistic function to model a binary dependent variable, although many more complex extensions exist. In regression analysis, logistic regression (or logit regression) is estimating the parameters of a logistic model (a form of binary regression). Use function glm() notice the family = binomial
library(ISLR)
data(Default)
options(scipen = 9)  # discourage scientific notation in printed output
# Logistic regression of default on balance; note family = binomial.
logitMod1 <- glm(factor(default) ~ balance, family = binomial, data = Default)
summary(logitMod1)
##
## Call:
## glm(formula = factor(default) ~ balance, family = binomial, data = Default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2697 -0.1465 -0.0589 -0.0221 3.7589
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.6513306 0.3611574 -29.49 <2e-16 ***
## balance 0.0054989 0.0002204 24.95 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1596.5 on 9998 degrees of freedom
## AIC: 1600.5
##
## Number of Fisher Scoring iterations: 8
## (Intercept) balance
## -10.6513 0.0055
library(glmnet)
library(glmnetUtils)
# load the movies dataset
library("tidyverse")
options(scipen = 50)
set.seed(1861)
movies <- read.csv(here::here("datasets", "movie_metadata.csv"))
# Drop extreme budgets, blank/unrated content ratings, and missing gross.
movies <- movies %>%
  filter(budget < 400000000) %>%
  filter(content_rating != "", content_rating != "Not Rated", !is.na(gross))
# Main genre = first token of the pipe-separated `genres`; money in $M.
# Use `genres` (pipeline column), not `movies$genres`.
movies <- movies %>%
  mutate(genre_main = unlist(map(strsplit(as.character(genres), "\\|"), 1)),
         grossM = gross / 1000000,
         budgetM = budget / 1000000,
         profitM = grossM - budgetM)
# Collapse rare factor levels and rescale cast likes to thousands (trailing
# comma inside the original mutate() removed).
movies <- movies %>%
  mutate(genre_main = fct_lump(genre_main, 5),
         content_rating = fct_lump(content_rating, 3),
         country = fct_lump(country, 2),
         cast_total_facebook_likes000s = cast_total_facebook_likes / 1000) %>%
  drop_na()
train_idx <- sample(seq_len(nrow(movies)), size = floor(0.75 * nrow(movies)))
movies_train <- movies %>% slice(train_idx)
movies_test <- movies %>% slice(-train_idx)
# estimate ridge mod (alpha = 0 => ridge); drop identifier/leaky columns.
Ridge_mod <- cv.glmnet(profitM ~ .,
                       data = movies_train %>%
                         select(-c(director_name, actor_1_name, actor_2_name,
                                   actor_3_name, plot_keywords, movie_imdb_link,
                                   country, budgetM, grossM, genres, language,
                                   movie_title)),
                       alpha = 0)
coef(Ridge_mod)
## 31 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 353.116523189860
## color 3.220429457000
## color Black and White -1.230889234459
## colorColor 1.273946091430
## num_critic_for_reviews -0.004287598868
## duration -0.059435354254
## director_facebook_likes -0.000095603299
## actor_3_facebook_likes 0.000377629504
## actor_1_facebook_likes -0.000046854225
## gross 0.000000808363
## num_voted_users 0.000026098607
## cast_total_facebook_likes 0.000011936125
## facenumber_in_poster 0.097107398006
## num_user_for_reviews 0.001753590606
## content_ratingPG 1.541944591659
## content_ratingPG-13 -0.081014605835
## content_ratingR -0.790656664619
## content_ratingOther 0.262087870726
## budget -0.000000764324
## title_year -0.175902433003
## actor_2_facebook_likes -0.000072922047
## imdb_score 0.793676412085
## aspect_ratio -1.260327725766
## movie_facebook_likes 0.000031219836
## genre_mainAction -2.508682376558
## genre_mainAdventure -0.949174857631
## genre_mainComedy 1.857795657356
## genre_mainCrime -0.752570921384
## genre_mainDrama 0.358810332201
## genre_mainOther 1.660817121433
## cast_total_facebook_likes000s 0.021206225810
library(glmnet)
library(glmnetUtils)
# load the movies dataset
library("tidyverse")
options(scipen = 50)
set.seed(1861)
movies <- read.csv(here::here("datasets", "movie_metadata.csv"))
movies <- movies %>%
  filter(budget < 400000000) %>%
  filter(content_rating != "", content_rating != "Not Rated", !is.na(gross))
# Main genre and money columns in $M; `genres` referenced from the pipeline.
movies <- movies %>%
  mutate(genre_main = unlist(map(strsplit(as.character(genres), "\\|"), 1)),
         grossM = gross / 1000000,
         budgetM = budget / 1000000,
         profitM = grossM - budgetM)
# Collapse rare levels, then drop identifier/leaky columns up front this time
# (budget and gross included, unlike the ridge section).
movies <- movies %>%
  mutate(genre_main = fct_lump(genre_main, 5),
         content_rating = fct_lump(content_rating, 3),
         country = fct_lump(country, 2),
         cast_total_facebook_likes000s = cast_total_facebook_likes / 1000) %>%
  drop_na() %>%
  select(-c(director_name, actor_1_name, actor_2_name, actor_3_name,
            plot_keywords, movie_imdb_link, country, budgetM, grossM, genres,
            language, movie_title, budget, gross))
train_idx <- sample(seq_len(nrow(movies)), size = floor(0.75 * nrow(movies)))
movies_train <- movies %>% slice(train_idx)
movies_test <- movies %>% slice(-train_idx)
# estimate Lasso mod (alpha = 1 => lasso)
Lasso_mod <- cv.glmnet(profitM ~ ., data = movies_train, alpha = 1)
# Coefficients at the lambda that minimizes cross-validated error.
coef(Lasso_mod, s = Lasso_mod$lambda.min)
## 29 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 1704.5386926912
## color 7.8685464686
## color Black and White -14.0683095963
## colorColor .
## num_critic_for_reviews -0.0114947113
## duration -0.2776782023
## director_facebook_likes -0.0005768620
## actor_3_facebook_likes -0.0095188709
## actor_1_facebook_likes -0.0081385588
## num_voted_users 0.0001681242
## cast_total_facebook_likes 0.0080395751
## facenumber_in_poster 0.2666746699
## num_user_for_reviews 0.0075827386
## content_ratingPG 10.8248014912
## content_ratingPG-13 .
## content_ratingR -7.3220267644
## content_ratingOther 1.7707853864
## title_year -0.8436939485
## actor_2_facebook_likes -0.0083079361
## imdb_score 3.1091588898
## aspect_ratio -5.7165591209
## movie_facebook_likes 0.0001270866
## genre_mainAction -11.7644203113
## genre_mainAdventure -2.7208186957
## genre_mainComedy 6.1636778997
## genre_mainCrime -6.8162401714
## genre_mainDrama .
## genre_mainOther 5.3857834335
## cast_total_facebook_likes000s 0.0141678626
## 29 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 987.3257625732
## color .
## color Black and White .
## colorColor .
## num_critic_for_reviews .
## duration -0.0650488589
## director_facebook_likes .
## actor_3_facebook_likes 0.0010865538
## actor_1_facebook_likes .
## num_voted_users 0.0001630243
## cast_total_facebook_likes .
## facenumber_in_poster .
## num_user_for_reviews .
## content_ratingPG 5.7811040203
## content_ratingPG-13 .
## content_ratingR -4.3764504437
## content_ratingOther .
## title_year -0.4879797143
## actor_2_facebook_likes .
## imdb_score .
## aspect_ratio -2.7584906529
## movie_facebook_likes .
## genre_mainAction -5.3686769625
## genre_mainAdventure .
## genre_mainComedy 3.9983665305
## genre_mainCrime .
## genre_mainDrama .
## genre_mainOther .
## cast_total_facebook_likes000s .
# put in a matrix: side-by-side lambda.min vs lambda.1se coefficient vectors.
coef_mat <- data.frame(varname = rownames(coef(Lasso_mod)) %>% data.frame(),
                       Lasso_min = as.matrix(coef(Lasso_mod, s = Lasso_mod$lambda.min)) %>% round(3),
                       Lasso_1se = as.matrix(coef(Lasso_mod, s = Lasso_mod$lambda.1se)) %>% round(3)) %>%
  rename(varname = 1, Lasso_min = 2, Lasso_1se = 3) %>%
  remove_rownames()
coef_mat
## [1] 0.00 0.25 0.50 0.75 1.00
## Call:
## cva.glmnet.formula(formula = profitM ~ ., data = movies_train,
## alpha = alpha_list)
##
## Model fitting options:
## Sparse model matrix: FALSE
## Use model.frame: FALSE
## Alpha values: 0 0.25 0.5 0.75 1
## Number of crossvalidation folds for lambda: 10
## 29 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 967.491
## color .
## color Black and White .
## colorColor .
## num_critic_for_reviews .
## duration -0.057
## director_facebook_likes .
## actor_3_facebook_likes 0.001
## actor_1_facebook_likes .
## num_voted_users 0.000
## cast_total_facebook_likes .
## facenumber_in_poster .
## num_user_for_reviews .
## content_ratingPG 5.704
## content_ratingPG-13 .
## content_ratingR -4.308
## content_ratingOther .
## title_year -0.478
## actor_2_facebook_likes .
## imdb_score .
## aspect_ratio -2.778
## movie_facebook_likes .
## genre_mainAction -5.158
## genre_mainAdventure .
## genre_mainComedy 3.928
## genre_mainCrime .
## genre_mainDrama .
## genre_mainOther .
## cast_total_facebook_likes000s .
# if we want the lambda.min version for the alpha = 0.75 sub-model
# (modlist[[4]] is the fourth alpha in c(0, 0.25, 0.5, 0.75, 1))
coef(enet_fit, alpha = 0.75, s = enet_fit$modlist[[4]]$lambda.min)
## 29 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 1704.8100887902
## color 8.3117548645
## color Black and White -14.1284756012
## colorColor .
## num_critic_for_reviews -0.0119109398
## duration -0.2792141763
## director_facebook_likes -0.0005735128
## actor_3_facebook_likes -0.0098836808
## actor_1_facebook_likes -0.0083783098
## num_voted_users 0.0001680881
## cast_total_facebook_likes 0.0056239902
## facenumber_in_poster 0.2695339896
## num_user_for_reviews 0.0076388802
## content_ratingPG 10.8481088871
## content_ratingPG-13 .
## content_ratingR -7.3148179137
## content_ratingOther 1.8032278465
## title_year -0.8438615745
## actor_2_facebook_likes -0.0085528071
## imdb_score 3.1407490708
## aspect_ratio -5.7212843610
## movie_facebook_likes 0.0001285754
## genre_mainAction -11.8516721784
## genre_mainAdventure -2.8045684545
## genre_mainComedy 6.0821880425
## genre_mainCrime -6.8999275318
## genre_mainDrama .
## genre_mainOther 5.3565127462
## cast_total_facebook_likes000s 2.6695073318
## 29 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 987.326
## color .
## color Black and White .
## colorColor .
## num_critic_for_reviews .
## duration -0.065
## director_facebook_likes .
## actor_3_facebook_likes 0.001
## actor_1_facebook_likes .
## num_voted_users 0.000
## cast_total_facebook_likes .
## facenumber_in_poster .
## num_user_for_reviews .
## content_ratingPG 5.781
## content_ratingPG-13 .
## content_ratingR -4.376
## content_ratingOther .
## title_year -0.488
## actor_2_facebook_likes .
## imdb_score .
## aspect_ratio -2.758
## movie_facebook_likes .
## genre_mainAction -5.369
## genre_mainAdventure .
## genre_mainComedy 3.998
## genre_mainCrime .
## genre_mainDrama .
## genre_mainOther .
## cast_total_facebook_likes000s .
# Compare coefficient estimates across the whole alpha grid — ridge
# (alpha = 0), three elastic-net mixes, and lasso (alpha = 1) — one column
# per alpha, rounded to 3 decimals, with positional renames to clean labels.
# NOTE(review): no `s =` is supplied here, so coef() uses its default lambda
# for each sub-model — confirm which lambda that is for cva.glmnet objects.
enet_coefs <- data.frame(varname = rownames(coef(enet_fit, alpha = 0)), ridge = as.matrix(coef(enet_fit,
alpha = 0)) %>% round(3), alpha025 = as.matrix(coef(enet_fit, alpha = 0.25)) %>%
round(3), alpha05 = as.matrix(coef(enet_fit, alpha = 0.5)) %>% round(3),
alpha075 = as.matrix(coef(enet_fit, alpha = 0.75)) %>% round(3), lasso = as.matrix(coef(enet_fit,
alpha = 1)) %>% round(3)) %>% rename(varname = 1, ridge = 2, alpha025 = 3,
alpha05 = 4, alpha075 = 5, lasso = 6) %>% remove_rownames()## 1 2 3 4 5
## 0.0013056797 0.0021125949 0.0085947405 0.0004344368 0.0017769574
## 6
## 0.0037041528
# Confusion matrix for model 1 at a 0.5 probability threshold:
# scores above 0.5 are classified "Yes" (default). Listing "Yes" first in
# the factor levels makes caret treat it as the positive class.
library("caret")
confusionMatrix(factor(ifelse(preds_DF$scores_mod1 > 0.5, "Yes", "No"), levels = c("Yes",
"No")), factor(preds_DF$default, levels = c("Yes", "No")))## Confusion Matrix and Statistics
##
## Reference
## Prediction Yes No
## Yes 100 42
## No 233 9625
##
## Accuracy : 0.9725
## 95% CI : (0.9691, 0.9756)
## No Information Rate : 0.9667
## P-Value [Acc > NIR] : 0.0004973
##
## Kappa : 0.4093
##
## Mcnemar's Test P-Value : < 0.00000000000000022
##
## Sensitivity : 0.3003
## Specificity : 0.9957
## Pos Pred Value : 0.7042
## Neg Pred Value : 0.9764
## Prevalence : 0.0333
## Detection Rate : 0.0100
## Detection Prevalence : 0.0142
## Balanced Accuracy : 0.6480
##
## 'Positive' Class : Yes
##
# Stack both models' scores into one long data frame so ggplot can draw the
# two ROC curves on a single plot, distinguished by the `models` label.
TrainDF <- data.frame(
    default = c(Default$default, Default$default),
    scores = c(preds_DF$scores_mod1, preds_DF$scores_mod2),
    models = c(rep("X = Student", length(preds_DF$scores_mod1)),
        rep("X = Student + Balance + Income", length(preds_DF$scores_mod2)))
)
library(ggplot2)
library("plotROC")
# ROC curves with labeled probability cutoffs, blue axis text, and the
# legend anchored in the bottom-right corner of the panel.
TrainROC <- ggplot(TrainDF, aes(m = scores, d = default, color = models)) +
    geom_roc(show.legend = TRUE, labelsize = 3.5,
        cutoffs.at = c(0.99, 0.9, 0.7, 0.5, 0.3, 0.1, 0)) +
    style_roc(theme = theme_grey) +
    theme(axis.text = element_text(colour = "blue")) +
    theme(legend.justification = c(1, 0), legend.position = c(1, 0),
        legend.box.margin = margin(c(50, 50, 50, 50)))
# Render the ROC plot. plotROC warns because `default` is a factor
# (presumably levels "No"/"Yes") rather than 0/1; it assumes the second
# level is the event, which matches the intent here.
plot(TrainROC)## Warning in verify_d(data$d): D not labeled 0/1, assuming 1 = 0 and 2 = 1!
Note: load plyr (`library(plyr)`) before loading the tidyverse (`library(tidyverse)`), so that dplyr's functions mask plyr's rather than the other way around.
# Calibration plot for model 2: bin the predicted probabilities and compare
# each bin's mean prediction against the observed default rate.
scores3DF <- data.frame(default = ifelse(Default$default == "Yes", 1, 0), scores = preds_DF$scores_mod2)
# Group rows by score bin (via cut()) and take column-wise means: the mean
# of `default` is the observed event rate in the bin, the mean of `scores`
# is the average prediction. Uses plyr's ddply/colwise, hence the note
# elsewhere about loading plyr before the tidyverse.
calData <- ddply(scores3DF, .(cut(scores3DF$scores, c(0, 0.05, 0.15, 0.25, 0.35,
0.45, 0.55, 0.65, 0.75, 0.85, 0.95, 1))), colwise(mean))
# Nominal midpoints of the eleven bins above.
calData$midpoint <- c(0.025, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.975)
# NOTE(review): ddply's output likely has a grouping column plus the two
# column means, so after adding `midpoint` there may be four columns while
# only three names are assigned here — verify that `midpoint` and `true`
# end up mapped to the intended columns.
colnames(calData) <- c("preds", "true", "midpoint")
# A perfectly calibrated model falls on the red 45-degree reference line.
calPlot <- ggplot(calData, aes(x = midpoint, y = true)) + geom_point() + ylim(0,
1) + geom_abline(intercept = 0, slope = 1, color = "red") + xlab("Prediction midpoint") +
ylab("Observed event percentage")
plot(calPlot)
R uses factors to handle categorical variables — variables that have a fixed and known set of possible values. Factors are also helpful for reordering character vectors to improve display. The goal of the forcats package is to provide a suite of tools that solve common problems with factors, including changing the order of levels or the values. Some examples include:
- `fct_reorder()`: reordering a factor by another variable.
- `fct_infreq()`: reordering a factor by the frequency of values.
- `fct_relevel()`: changing the order of a factor by hand.
- `fct_lump()`: collapsing the least/most frequent values of a factor into "other".